This document describes the analysis of oral microbial signatures in head and neck cancer patients, focusing on their longitudinal disease severity patterns. The analysis includes nonnegative PCA to derive a well-being score, clustering of patient curves, and linear mixed models to identify significant microbial features associated with disease severity.
# Load required packages
suppressPackageStartupMessages(library(PhiSpace))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(qs)) # quick read and write of R objects
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(seriation))
suppressPackageStartupMessages(library(cluster))
suppressPackageStartupMessages(library(factoextra))
suppressPackageStartupMessages(library(nlme))
suppressPackageStartupMessages(library(circlize))
suppressPackageStartupMessages(library(rcartocolor))
# Load custom functions
source("0.utils.R")
metaDat <- read.csv(
paste0("data_filtered/meta_filtered.csv"), row.names = 1
)
metaDat <- metaDat %>%
select(
c(id, distress:teeth, timepoint)
)
Only two symptoms had missing values (i.e. taste, distress), so we impute them with the mean of the respective column. We first double-centered the data. That is, we first row-center \(X\) and then column-center it so that the double-centered version has zero row means and column means. We conducted the row-centering to the usually column-centering, so that different patients’ ratings at different time points were more comparable.
X <- metaDat %>% select(distress:teeth) %>% as.matrix()
for(icol in which(colSums(is.na(X)) > 0)){
icolVals <- X[,icol]
X[is.na(icolVals), icol] <- mean(icolVals, na.rm = T)
}
sum(is.na(X))
## [1] 0
Xcent <- PhiSpace::doubleCent(X)
To summarize patients’ symptom ratings, we conducted non-negative sparse principal component analysis (NSPCA) of the double-centered version of \(X\). We used NSPCA instead of conventional PCA since NSPCA ensures that the principal component (PC) loadings to be nonnegative and sparse, which provides more biological interpretability.
# perform non-negative sparse PCA
npcaRes0 <- nsprcomp::nsprcomp(Xcent, center = F, nneg = T)
npcaRes <- list(
scores = npcaRes0$x, loadings = npcaRes0$rotation
)
# Visualise the results
PhiSpace::matrixPlot(npcaRes$scores, comp_idx = 1:5, colBy = metaDat$mucositis, compName = "PC", returnPlotList = F)
## TableGrob (5 x 5) "arrange": 16 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (4-4,1-1) arrange gtable[layout]
## 5 5 (5-5,1-1) arrange gtable[layout]
## 6 6 (2-2,2-2) arrange gtable[layout]
## 7 7 (3-3,2-2) arrange gtable[layout]
## 8 8 (4-4,2-2) arrange gtable[layout]
## 9 9 (5-5,2-2) arrange gtable[layout]
## 10 10 (3-3,3-3) arrange gtable[layout]
## 11 11 (4-4,3-3) arrange gtable[layout]
## 12 12 (5-5,3-3) arrange gtable[layout]
## 13 13 (4-4,4-4) arrange gtable[layout]
## 14 14 (5-5,4-4) arrange gtable[layout]
## 15 15 (5-5,5-5) arrange gtable[layout]
## 16 16 (1-1,5-5) arrange gtable[guide-box]
PhiSpace::matrixPlot(npcaRes$scores, comp_idx = 1:5, colBy = metaDat$timepoint, compName = "PC", returnPlotList = F)
## TableGrob (5 x 5) "arrange": 16 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (4-4,1-1) arrange gtable[layout]
## 5 5 (5-5,1-1) arrange gtable[layout]
## 6 6 (2-2,2-2) arrange gtable[layout]
## 7 7 (3-3,2-2) arrange gtable[layout]
## 8 8 (4-4,2-2) arrange gtable[layout]
## 9 9 (5-5,2-2) arrange gtable[layout]
## 10 10 (3-3,3-3) arrange gtable[layout]
## 11 11 (4-4,3-3) arrange gtable[layout]
## 12 12 (5-5,3-3) arrange gtable[layout]
## 13 13 (4-4,4-4) arrange gtable[layout]
## 14 14 (5-5,4-4) arrange gtable[layout]
## 15 15 (5-5,5-5) arrange gtable[layout]
## 16 16 (1-1,5-5) arrange gtable[guide-box]
PhiSpace::loadBarplot(npcaRes$loadings, comp = "PC1", nfeat = 10)
PhiSpace::loadBarplot(npcaRes$loadings, comp = "PC2")
PhiSpace::loadBarplot(npcaRes$loadings, comp = "PC3")
# Save the figures for paper
p <- PhiSpace::matrixPlot(
npcaRes$scores, comp_idx = 1:2, colBy = metaDat$mucositis, compName = "PC",
returnPlotList = F, pointSize = 1, fsize = 7, legendTitle = ""
)
png(
paste0("figs/P12_legend.png"),
width = 2, height = 0.5, units = "in", res = 300
)
op <- par(mar = rep(0, 4))
suppressWarnings(grid::grid.draw(get_legend(p + theme(legend.position = "top"))))
par(op)
dev.off()
## quartz_off_screen
## 2
ggsave(
paste0("figs/PC12.png"), p + theme(legend.position = "none"), width = 3, height = 2.5
)
p <- PhiSpace::matrixPlot(
npcaRes$scores, comp_idx = 1, colBy = metaDat$mucositis, compName = "PC",
returnPlotList = F, pointSize = 1, fsize = 7, legendTitle = ""
)
ggsave(
paste0("figs/P1.png"), p + theme(legend.position = "none"), width = 3, height = 1.5
)
p <- PhiSpace::loadBarplot(npcaRes$loadings, comp = "PC1", nfeat = 8, fsize = 8)
ggsave(
paste0("figs/P1_load.png"), p, width = 2.5, height = 3
)
After examining the loadings of PC1–PC5, we chose the PC1 as the calibrated OM score for the following reasons. First, PC1 is positively correlated with the calibrated OM score. Moreover, PC1 summarises 7 different symptoms pertaining to OM, and hence provides a more comprehensive evaluation of calibrated OM score.
metaDat[,"OMscore_calib"] <- npcaRes$scores[,"PC1"]
# Visualise the calibrated OM score
metaDat <- metaDat %>% distinct(id, timepoint, .keep_all = TRUE)
ggplot(metaDat, aes(x = timepoint, y = OMscore_calib)) +
geom_line(aes(group = id), colour = "gray") +
geom_smooth() +
labs(x = "Day", y = "Disease severity score") +
theme_bw(base_size = 5)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggsave("figs/allCurves.png", width = 3, height = 2.5)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
For each patient at their respective observational time points, we used a functional data analysis approach to group the patients according to the longitudinal patterns of their calibrated OM scores.
For the \(i\)th patient we linearly interpolated the patient’s observed calibrated OM scores at discrete time points to obtain the patient’s calibrated OM score function as \(f_i(t)\) for \(t\in[\tau_i, T_i]\), where \(\tau_i\) and \(T_i\) the minimum (baseline) and maximum (discharge) time points of patient \(i\). We avoided extrapolating \(f_i\) outside its observational time range to retain the original baseline and discharge time points of patients.
# Interpolate calibrated OM scores at discrete time points
timeGrid <- seq(
min(metaDat$timepoint), max(metaDat$timepoint), 0.5
)
metaList <- split(metaDat, ~id)
interpVals <- lapply(
1:length(metaList),
function(ii){
subDF <- metaList[[ii]]
modPred <- approx(
x = subDF$timepoint, y = subDF[,"OMscore_calib"], xout = timeGrid,
)
return(modPred)
}
) %>% `names<-`(names(metaList))
X <- sapply(
interpVals, function(x) x$y
) %>% t()
We then calculated the pairwise distances between the patient’s calibrated OM score functions and found the number of clusters to be \(3\) based on the gap statistic.
distM <- distMiss(X)
X2clust <- as.matrix(distM)
#number of clusters according to Gap statistics
set.seed(8790)
gap_stat <- clusGap(X2clust, hcut, K.max=10, B = 100)
gapRes <- fviz_gap_stat(gap_stat)
gapRes
#visualise the calibrated OM score trajectories for each cluster
set.seed(8475)
hc <- hclust(dist(X2clust), method = "ward.D2")
plot(as.dendrogram(hc), main = "Dendrogram of Rows", xlab = "Rows", ylab = "Height")
k <- 3
clusters <- hcut(X2clust, k = k)$cluster
df <- X2clust %>% as.data.frame() %>% mutate(id = rownames(X2clust), cluster = as.factor(clusters))
metaDat_c <- merge(metaDat, df[,c("id", "cluster")], by = "id")
ggplot(metaDat_c, aes(x = timepoint, y = OMscore_calib, color = cluster, group = id)) +
geom_line(linewidth = 0.3) +
labs( x = "Day", y = "Disease severity score") +
theme_bw(base_size = 5) +
theme(
legend.title = element_text(face = "bold", size=9),
legend.text=element_text(size=8)
) +
scale_color_manual(labels = c("1 (n=31)", "2 (n=47)", "3 (n=62)"),
values = carto_pal(n=5, "Safe"))
ggsave("figs/curvesCluster.png", width = 3.5, height = 2.5)
ggplot(metaDat_c, aes(x = timepoint, y = OMscore_calib, color = cluster, group = id)) +
geom_line() +
labs( x = "Day", y = "Disease severity score") +
theme_bw(base_size = 5)+
facet_wrap(~ cluster)+
scale_color_manual(values = carto_pal(n=5, "Safe"))
metaDat_c%>% select(id, cluster) %>% unique()%>%select(cluster)%>%table()
## cluster
## 1 2 3
## 31 67 42
To investigate whether specific clinical features contributed to differences between patient groups, we first visualized marginal density plots for relevant variables such as age, weight, and BMI. Where visual differences between groups were apparent, we performed two-sample t-tests to formally assess differences across cluster combinations. Assumptions of normality and homogeneity of variance were checked prior to conducting the tests.
# Load patient characteristics
clFeat <- readr::read_csv("data/age_final_data.csv")
clFeat <- merge(df[,c("id","cluster")], clFeat, by = "id")
clFeat$cluster <- as.factor(clFeat$cluster)
clFeat$bmi <- clFeat$weight/clFeat$heightnew^2
table(clFeat$cluster)
##
## 1 2 3
## 29 66 42
clFeat %>% group_by(cluster) %>% summarise_all(mean, na.rm = TRUE)
## Warning: There were 9 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `id = (new("standardGeneric", .Data = function (x, ...) ...`.
## ℹ In group 1: `cluster = 1`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 8 remaining warnings.
table(clFeat$cluster)
##
## 1 2 3
## 29 66 42
clFeat %>% ggplot(aes(age)) + geom_density(aes(colour = cluster),show.legend = FALSE)+
theme_bw(base_size = 7)+
scale_color_manual(values = carto_pal(n=5, "Safe"))
ggsave("figs/age.png", width = 3, height = 2.5)
clFeat %>% ggplot(aes(weight)) + geom_density(aes(colour = cluster),show.legend = FALSE)+
theme_bw(base_size = 7)+
scale_color_manual(values = carto_pal(n=5, "Safe"))
ggsave("figs/weight.png", width = 3, height = 2.5)
clFeat %>% ggplot(aes(bmi)) + geom_density(aes(colour = cluster), show.legend = FALSE)+
theme_bw(base_size = 7)+
scale_color_manual(values = carto_pal(n=5, "Safe"))
# Age- (pvalue 0.04682)
data1 <- clFeat$age[clFeat$cluster == 1]
data2 <- clFeat$age[clFeat$cluster == 2]
# Normality ( pvalues 0.857, 0.3249)
shapiro.test(data1);shapiro.test(data2)
##
## Shapiro-Wilk normality test
##
## data: data1
## W = 0.98076, p-value = 0.857
##
## Shapiro-Wilk normality test
##
## data: data2
## W = 0.97679, p-value = 0.2513
# Equal variance (pvalue 0.927)
var.test(data1, data2)
##
## F test to compare two variances
##
## data: data1 and data2
## F = 0.93311, num df = 28, denom df = 65, p-value = 0.8636
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.5140049 1.8371216
## sample estimates:
## ratio of variances
## 0.9331069
t.test(data1, data2, var.equal = T, alternative = "greater")
##
## Two Sample t-test
##
## data: data1 and data2
## t = 0.72152, df = 93, p-value = 0.2362
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -1.668132 Inf
## sample estimates:
## mean of x mean of y
## 59.68966 58.40909
# Age- (pvalue 0.1907)
data1 <- clFeat$age[clFeat$cluster == 1]
data2 <- clFeat$age[clFeat$cluster == 3]
# Normality ( pvalues 0.857, 0.3533)
shapiro.test(data1);shapiro.test(data2)
##
## Shapiro-Wilk normality test
##
## data: data1
## W = 0.98076, p-value = 0.857
##
## Shapiro-Wilk normality test
##
## data: data2
## W = 0.97668, p-value = 0.537
# Equal variance (pvalue 0.746)
var.test(data1, data2)
##
## F test to compare two variances
##
## data: data1 and data2
## F = 0.93019, num df = 28, denom df = 41, p-value = 0.8534
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4764973 1.9002115
## sample estimates:
## ratio of variances
## 0.9301912
t.test(data1, data2, var.equal = T, alternative = "greater")
##
## Two Sample t-test
##
## data: data1 and data2
## t = 2.0102, df = 69, p-value = 0.02416
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.6579815 Inf
## sample estimates:
## mean of x mean of y
## 59.68966 55.83333
# Age- (pvalue 0.8364)
data1 <- clFeat$age[clFeat$cluster == 2]
data2 <- clFeat$age[clFeat$cluster == 3]
# Normality ( pvalues 0.3249, 0.3533)
shapiro.test(data1);shapiro.test(data2)
##
## Shapiro-Wilk normality test
##
## data: data1
## W = 0.97679, p-value = 0.2513
##
## Shapiro-Wilk normality test
##
## data: data2
## W = 0.97668, p-value = 0.537
# Equal variance (pvalue 0.788)
var.test(data1, data2)
##
## F test to compare two variances
##
## data: data1 and data2
## F = 0.99688, num df = 65, denom df = 41, p-value = 0.9742
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.5596967 1.7131704
## sample estimates:
## ratio of variances
## 0.9968754
t.test(data1, data2, var.equal = T, alternative = "greater")
##
## Two Sample t-test
##
## data: data1 and data2
## t = 1.6205, df = 106, p-value = 0.05405
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -0.06178801 Inf
## sample estimates:
## mean of x mean of y
## 58.40909 55.83333
# Weight (pvalue 0.02683)
data1 <- clFeat$weight[clFeat$cluster == 1]
data2 <- clFeat$weight[clFeat$cluster == 2]
# Normality ( pvalues 0.1304, 0.4293)
shapiro.test(data1);shapiro.test(data2)
##
## Shapiro-Wilk normality test
##
## data: data1
## W = 0.94436, p-value = 0.1304
##
## Shapiro-Wilk normality test
##
## data: data2
## W = 0.95385, p-value = 0.0154
# Equal variance (pvalue 0.7883)
var.test(data1, data2)
##
## F test to compare two variances
##
## data: data1 and data2
## F = 1.2629, num df = 28, denom df = 65, p-value = 0.436
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6956553 2.4863642
## sample estimates:
## ratio of variances
## 1.262869
t.test(data1, data2, var.equal = T, alternative = "less")
##
## Two Sample t-test
##
## data: data1 and data2
## t = -1.367, df = 93, p-value = 0.08746
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 2.608668
## sample estimates:
## mean of x mean of y
## 185.8276 197.9394
# Weight (pvalue 0.03713)
data1 <- clFeat$weight[clFeat$cluster == 1]
data2 <- clFeat$weight[clFeat$cluster == 3]
# Normality ( pvalues 0.1304, 0.006892)
shapiro.test(data1);shapiro.test(data2)
##
## Shapiro-Wilk normality test
##
## data: data1
## W = 0.94436, p-value = 0.1304
##
## Shapiro-Wilk normality test
##
## data: data2
## W = 0.95247, p-value = 0.07919
# Equal variance (pvalue 0.2749)
var.test(data1, data2)
##
## F test to compare two variances
##
## data: data1 and data2
## F = 1.3034, num df = 28, denom df = 41, p-value = 0.432
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6676696 2.6625823
## sample estimates:
## ratio of variances
## 1.303387
t.test(data1, data2, var.equal = T, alternative = "less")
##
## Two Sample t-test
##
## data: data1 and data2
## t = -2.6502, df = 69, p-value = 0.004984
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -9.47752
## sample estimates:
## mean of x mean of y
## 185.8276 211.3810
# Weight (pvalue 0.6888)
data1 <- clFeat$weight[clFeat$cluster == 2]
data2 <- clFeat$weight[clFeat$cluster == 3]
# Normality ( pvalues 0.4293, 0.006892)
shapiro.test(data1);shapiro.test(data2)
##
## Shapiro-Wilk normality test
##
## data: data1
## W = 0.95385, p-value = 0.0154
##
## Shapiro-Wilk normality test
##
## data: data2
## W = 0.95247, p-value = 0.07919
# Equal variance (pvalue 0.3523)
var.test(data1, data2)
##
## F test to compare two variances
##
## data: data1 and data2
## F = 1.0321, num df = 65, denom df = 41, p-value = 0.9283
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.5794648 1.7736781
## sample estimates:
## ratio of variances
## 1.032084
t.test(data1, data2, var.equal = T, alternative = "less")
##
## Two Sample t-test
##
## data: data1 and data2
## t = -1.7896, df = 106, p-value = 0.03819
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -0.9779354
## sample estimates:
## mean of x mean of y
## 197.9394 211.3810
# Investigate the relationship between calibrated OM score and age, weigh
medOMscore_calib <- metaDat_c %>% group_by(id) %>% summarise(medOMscore_calib = median(OMscore_calib))
clFeat <- merge(medOMscore_calib, clFeat, by = "id")
clFeat %>% ggplot(aes(medOMscore_calib, weight)) + geom_line()
clFeat %>% ggplot(aes(medOMscore_calib, age)) + geom_line()
To identify microbial features associated with calibrated OM scores, we used a novel variable selection method, partial least squares knockoff (PLSKO). PLSKO is a knock-off variable selection method based on PLS regression. We opted for a knock-off-based approach for variable selection since knock-off approaches provide statistically principled false discovery control, whereas alternative methods such as lasso and sparse PLS lack. We first applied PLSKO to the whole dataset to identify OTUs correlated with the calibrated OM score across all patients. Then, we applied PLSKO to each individual cluster of patients to uncover group-specific OTUs.
# Prepare data
taxonomy_table <- read.csv("data/taxonomy_table.csv",
row.names = 1
)
microb <- readRDS("data_filtered/sce_all.rds")
selectedOTU <- readRDS("data_filtered/selectedOTUs.rds")
microb <- microb[selectedOTU, ]
microb <- PhiSpace::CLRnorm(microb)
# Add cluster labels
colData(microb)[,"cluster"] <- df[microb$id, "cluster"]
## PLSKO
patientIDs <- intersect(rownames(metaDat), colnames(microb))
X <- t(assay(microb, "data"))[patientIDs,] %>% as.matrix()
Y <- metaDat[patientIDs,"OMscore_calib"]
plskoRes_global_path <- "output/plsko/global.qs"
if(!file.exists(plskoRes_global_path)){
plsResAKO <- PLSKO::plsAKO(X, Y, ncores = 5, seed = 7639)
qsave(plsResAKO, plskoRes_global_path)
} else {
plsResAKO <- qread(plskoRes_global_path)
}
sel <- names(plsResAKO$ako.s)
genera <- taxonomy_table[sel,"genus"]
OTUids <- gsub("OTU", "", sel)
sig <- paste0(genera, OTUids)
plsRes <- PhiSpace::mvr(X, Y, ncomp = 2, method = "PLS", center = T)
impScores <- plsRes$coefficients[,,2]
load2plot <- impScores %>% as.matrix() %>% as.data.frame() %>% `colnames<-`("coef")
genera <- taxonomy_table[rownames(load2plot),"genus"]
OTUids <- gsub("OTU", "", rownames(load2plot))
rownames(load2plot) <- paste0(genera, OTUids)
p <- loadBarplot_v2(load2plot, comp = "coef", significant=sig, nfeat = 30, fsize = 7)
p
ggsave("figs/coefPlot_global.png", width = 1.8, height = 2.5)
knockoff_ls<-list(data.frame(taxa=sig,positive=as.numeric(load2plot[sig,]>0)))
plskoRes_group_path <- "output/plsko/group_list.qs"
dummy <- PhiSpace::codeY_vec(metaDat_c$cluster, rowNames = metaDat_c$id, method = "0,1")
nclust <- ncol(dummy)
if(!file.exists(plskoRes_group_path)){
groupPLSKOres <- lapply(
1:nclust, function(ii){
patientClust <- metaDat_c$id[metaDat_c$cluster == ii]
microbSub <- microb[,microb$id %in% patientClust]
patientIDs <- intersect(rownames(metaDat), colnames(microbSub))
X <- t(assay(microb, "data"))[patientIDs,] %>% as.matrix()
X <- X[,colVars(X)>1e-10]
Y <- metaDat[patientIDs,"OMscore_calib"]
plsRes <- PLSKO::plsAKO(X, Y, ncores = 5, seed = 7639)
sel <- names(plsRes$ako.s)
genera <- taxonomy_table[sel,"genus"]
OTUids <- gsub("OTU", "", sel)
return(paste0(genera, OTUids))
}
)
qsave(groupPLSKOres, plskoRes_group_path)
} else {
groupPLSKOres <- qread(plskoRes_group_path)
}
groupPLSres <- lapply(
1:nclust, function(ii){
patientClust <- metaDat_c$id[metaDat_c$cluster == ii]
microbSub <- microb[,microb$id %in% patientClust]
patientIDs <- intersect(rownames(metaDat), colnames(microbSub))
X <- t(assay(microb, "data"))[patientIDs,] %>% as.matrix()
X <- X[,colVars(X)>1e-10]
Y <- metaDat[patientIDs,"OMscore_calib"]
plsRes <- PhiSpace::mvr(X, Y, ncomp = 2, method = "PLS", center = T)
impScores <- plsRes$coefficients[,,2]
load2plot <- impScores %>% as.matrix() %>% as.data.frame() %>% `colnames<-`("coef")
genera <- taxonomy_table[rownames(load2plot),"genus"]
OTUids <- gsub("OTU", "", rownames(load2plot))
rownames(load2plot) <- paste0(genera, OTUids)
sig<-groupPLSKOres[[ii]]
knockoff_ls[[1+ii]]<<-data.frame(taxa=sig,positive=as.numeric(load2plot[sig,]>0))
loadBarplot_v2(load2plot, comp = "coef", significant=sig, nfeat = 30, fsize = 7)
}
)
p <- ggarrange(plotlist = groupPLSres, nrow = 1)
p
ggsave("figs/coefPlot_group.png", width = 5.6, height = 2.5)
# Combine knockoff_ls into a named list
knock_ls <- list(
All = knockoff_ls[[1]],
C1 = knockoff_ls[[2]],
C2 = knockoff_ls[[3]],
C3 = knockoff_ls[[4]]
)
# Get all unique taxa
all_taxa <- unique(unlist(lapply(knock_ls, function(x) x$taxa)))
# Create a matrix of values: 1 (positive), 0 (negative), NA (not selected)
taxa_matrix <- sapply(knock_ls, function(df) {
out <- rep(NA, length(all_taxa))
names(out) <- all_taxa
out[df$taxa] <- df$positive
out
})
# Reorder rows by same sorting logic
taxa_matrix <- taxa_matrix[order(
-rowSums(taxa_matrix),
-taxa_matrix[, "All"],
-taxa_matrix[, "C1"],
-taxa_matrix[, "C2"],
-taxa_matrix[, "C3"]
), ]
png(filename = "figs/clusterTable.png",
width=7, height=6, units="in", res=150)
pheatmap::pheatmap(taxa_matrix,
color = c("#0000FF80", "#FF000080"),
legend_breaks = c(0, 1),
legend_labels = c("Negative", "Positive"),
cluster_rows = FALSE,
cluster_cols = FALSE,
fontsize_row = 12,
border_color = "white",
na_col = "white" )
dev.off()
## quartz_off_screen
## 3
To investigate whether patient characteristics (such as age, height, and weight) and other factors (such as Timepoint and patient Cluster membership) were associated with microbial abundance, we applied linear mixed models (LMMs). LMMs extend standard linear models by including both fixed effects and random effects, making them particularly suitable for repeated-measures studies like this, which involve multiple observations from the same patient. In our models, age, height, weight, timepoint, and cluster were treated as fixed effects, while patients were modelled as random effects to account for dependencies in the data. We fitted separate LMMs for each taxon previously identified by PLSKO analysis as associated with the calibrated OM score, using CLR-transformed microbial abundance as the response variable.
patientIDs <- intersect(rownames(metaDat), colnames(microb))
X <- t(assay(microb, "data"))[patientIDs,] %>% as.matrix()
colnames(X) <- paste0(taxonomy_table[colnames(X),"genus"], gsub("OTU", "", colnames(X)))
rownames(metaDat_c)<- rownames(metaDat)
microb_meta<-merge(metaDat_c[,c("id", "timepoint","cluster","OMscore_calib")],X[, all_taxa], by = "row.names")%>%
merge(.,clFeat[c("id","sex", "weight", "bmi" , "heightnew" ,"age")], by="id")
microb_meta$sex<-factor(microb_meta$sex, levels = c("1","2"),
labels=c("Female", "Male"))
# Apply the model across all y's
models <- lapply(rownames(taxa_matrix), function(y) {
formula <- as.formula(paste(y, "~ timepoint + cluster + age + heightnew + weight"))
lme(formula, random = ~1 | id, data = microb_meta)
})
mat<-sapply(models, function(x){v<-summary(x)
sign(v$tTable[-1,1])*(-log(v$tTable[-1,5]))}) %>% t()
rownames(mat)<- rownames(taxa_matrix)
The resulting beta coefficients from these models represent the estimated effect size of each fixed effect on microbial abundance. The sign and magnitude of each beta coefficient indicate the direction and strength of the association for a given covariate (e.g., timepoint) while holding other covariates constant. To visualize these results, we plotted the values \(sign(\beta)\times-log(\beta)\) in a heatmap, highlighting the most significant factors influencing microbial abundance.
#-log(0.05)~3
# Define custom color function
# Define color function
col_fun <- colorRamp2(
breaks = c(-100, -3, -2.9, 0, 2.9 , 3, 100),
colors = c("darkorange4", "lightgoldenrod1", "grey","white","grey", "thistle1", "purple4")
)
Heatmap(mat,
name = "value",
col = col_fun,
cluster_rows = FALSE,
cluster_columns = FALSE,
show_row_dend = FALSE,
show_column_dend = FALSE,
rect_gp = gpar(col = "white", lwd = 2))
# save the heatmap
png(filename = "figs/lmmRes.png",
width=6, height=6, units="in", res=150)
Heatmap(mat,
name = "value",
col = col_fun,
cluster_rows = FALSE,
cluster_columns = FALSE,
show_row_dend = FALSE,
show_column_dend = FALSE,
rect_gp = gpar(col = "white", lwd = 2))
dev.off()
## quartz_off_screen
## 2